import requests
import time
from lxml import html
from pymongo import MongoClient


# Collect every district of a city and crawl each district's listing pages.
def get_areas(url, col):
    """Discover the districts listed on a city page and crawl each of them.

    Args:
        url: City-level listing URL. Each district's spell (slug) is appended
            to it to build the district URL; a trailing slash is optional.
        col: Collection object passed through unchanged to ``get_pages``.

    Raises:
        requests.RequestException: If the city page cannot be fetched,
            including when the request times out.
    """
    print('start grabing areas...')
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(url, headers=headers, timeout=30)
    content = html.fromstring(res.text)

    # Read the name and the slug from the same <li> so the two can never get
    # out of step (two independent page-wide lists may differ in length).
    districts = []
    for node in content.xpath(
            '//div[@class="filter-by-area-container"]/ul[@class="district-wrapper"]/li'):
        name = ''.join(node.xpath('text()')).strip()
        slug = node.get('data-district-spell')
        if not name or not slug:
            # Not a usable district entry; nothing to crawl for it.
            continue
        districts.append((name, slug))

    print([name for name, _ in districts])
    print([slug for _, slug in districts])

    # Normalise the base so the join works with or without a trailing slash.
    base = url.rstrip('/') + '/'
    for area, area_link in districts:
        print(area_link)
        link = base + area_link
        print("开始抓取页面：" + link + " ...")
        get_pages(area, link, col)


# Work out how many result pages a district has and crawl them.
def get_pages(area, area_link, col, max_pages=1):
    """Crawl the result pages of one district.

    The page count is derived from the total number of listings the district
    page reports, at 10 listings per page.

    Args:
        area: District name; stored with every listing and appended to each
            page URL as a fragment.
        area_link: URL of the district's listing page.
        col: Collection object passed through unchanged to ``get_house_info``.
        max_pages: Upper bound on the number of pages fetched. Defaults to 1
            to keep the load on the site low; pass ``None`` to fetch every
            page.

    Any failure while fetching or parsing is printed and followed by a
    20-second pause, so that one bad district does not stop the whole crawl.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
    try:
        # The request lives inside the try block so a network failure for one
        # district is handled like any other per-district error.
        res = requests.get(area_link, headers=headers, timeout=30)
        content = html.fromstring(res.text)

        # Total number of listings the page reports for this district.
        count = int(content.xpath('//div[@class ="page-box"]/@data-total-count')[0])
        # Ceiling division: 10 listings per page.
        pages = (count + 9) // 10
        print("这个区域共有" + str(pages) + "页")

        # Never request more pages than exist (none at all when the district
        # is empty) and never more than the caller allows.
        last_page = pages if max_pages is None else min(pages, max_pages)
        for page in range(1, last_page + 1):
            url = area_link + '/pg' + str(page) + '/#' + area
            print(url)
            print("开始抓取第" + str(page) + "页的信息...")

            get_house_info(area, url, col)
    except Exception as e:
        # Deliberate best-effort boundary: report, back off, move on.
        print(e)
        time.sleep(20)


def _first_match(node, path):
    """Return the first result of XPath ``path`` evaluated on ``node``, or None."""
    found = node.xpath(path)
    return found[0] if found else None


def _parse_listing(area, node):
    """Build one listing document from a result ``<li>``; None if incomplete."""
    title = _first_match(node, "a/@title")
    detail_area = _first_match(node, ".//div[@class='resblock-location']/span[2]/text()")
    detail_place = _first_match(node, ".//div[@class='resblock-location']/a/text()")
    kind = _first_match(node, ".//div[@class='resblock-name']/span[1]/text()")
    price = _first_match(node, ".//div[@class='main-price']/span[1]/text()")
    if any(value is None for value in (title, detail_area, detail_place, kind, price)):
        return None

    # The floor area is optional; fall back to an empty string.
    square = _first_match(node, ".//div[@class='resblock-area']/span/text()")
    if square is None:
        square = ""

    # Listings whose price is still undecided are stored with price 0.
    if price.strip() == '价格待定':
        price = 0
    try:
        price = int(price)
    except ValueError:
        return None

    return {
        "area": area,
        "title": title,
        "type": kind,
        "square": square,
        "detail_area": detail_area,
        "detail_place": detail_place,
        "price": price,
    }


# Scrape the new-home listings on one result page and store them in MongoDB.
def get_house_info(area, url, col):
    """Fetch one result page, parse its listings and insert them into ``col``.

    Every field of a listing is read relative to that listing's own ``<li>``,
    so a listing with a missing field cannot shift values onto its
    neighbours. Listings lacking a required field or a numeric price are
    skipped; at most 10 listings are stored per page.

    Args:
        area: District name stored in each document's ``area`` field.
        url: URL of the result page to fetch.
        col: Collection object; ``insert_many`` is called on it with the
            parsed documents when there is at least one.

    Any failure is printed together with the URL (and the response body when
    one was received), followed by a 20-second pause. Nothing is raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
    # Pause before every page request to keep the load on the server low.
    time.sleep(10)
    # Stays None when the request itself fails, so the handler below does not
    # touch an unbound name.
    res = None
    try:
        res = requests.get(url, headers=headers, timeout=30)
        content = html.fromstring(res.text)

        hlist = []
        for node in content.xpath("//ul[@class='resblock-list-wrapper']/li"):
            item = _parse_listing(area, node)
            if item is None:
                continue
            for key in ("title", "detail_area", "detail_place", "type", "square", "price"):
                print(item[key])
            hlist.append(item)
            # A result page is expected to carry 10 listings.
            if len(hlist) == 10:
                break

        # insert_many rejects an empty list, so only write when there is data.
        if hlist:
            col.insert_many(hlist)
        print('writing work has done! continue the next page...')
    except Exception as e:
        # Deliberate best-effort boundary: report, back off, move on.
        if res is not None:
            print(res.text)
        print(url)
        print(e)
        print('ooops! connecting error, retrying...')
        time.sleep(20)


def main():
    """Run the crawl: reset the target collection, then scrape every district.

    Connects to MongoDB on 127.0.0.1:27017, drops the ``loupan`` collection
    of the ``lianjia`` database so every run starts from an empty collection,
    crawls the configured city page and closes the connection afterwards,
    even when the crawl fails.
    """
    print('start!')
    # City-level listing page to crawl.
    url = 'https://wh.fang.lianjia.com/loupan/'
    client = MongoClient('127.0.0.1', 27017)
    try:
        db = client.get_database("lianjia")
        # Drop the old collection first so stale data from earlier runs
        # cannot mix with this run's results.
        db.get_collection("loupan").drop()
        col = db.get_collection("loupan")

        get_areas(url, col)
    finally:
        # Release the connection whether or not the crawl succeeded.
        client.close()


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
